Import required libraries
library("tidyverse")
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.3.0 [32m✓[30m [34mpurrr [30m 0.3.3
[32m✓[30m [34mtibble [30m 3.0.0 [32m✓[30m [34mdplyr [30m 0.8.5
[32m✓[30m [34mtidyr [30m 1.0.2 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.5.0[39m
[30m── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library("dplyr")
# read data and drop ID col
dataset_1 <- read.csv("../raw_data/dataset_1/winemag-data-130k-v2.csv")[-1]
Combine all the csv files for dataset_2
# function reads csv values
readfile <- function(filename){
cat("Reading file: ", filename, "...\n", sep = '' )
return(read.csv(file = filename, header = TRUE))
}
# function merges two dataframes together
merge_dataframes <- function(dataframe_1, dataframe_2){
return(rbind(dataframe_1, dataframe_2))
}
# funciton merges all csv files in folder to one dataframe
combine_data = function(mypath){
filenames=list.files(path=mypath, full.names=TRUE)
datalist = lapply(filenames, function(x) readfile(x))
return(Reduce(f = function(x,y) merge_dataframes(x,y), x = datalist, accumulate = FALSE))
}
# merge csv data given dataset folder
dataset_2 <- combine_data("../raw_data/dataset_2")
Reading file: ../raw_data/dataset_2/winemag-1-800.csv...
Reading file: ../raw_data/dataset_2/winemag-10401-11200.csv...
Reading file: ../raw_data/dataset_2/winemag-11201-12000.csv...
Reading file: ../raw_data/dataset_2/winemag-12001-12800.csv...
Reading file: ../raw_data/dataset_2/winemag-1601-2400.csv...
Reading file: ../raw_data/dataset_2/winemag-2401-3200.csv...
Reading file: ../raw_data/dataset_2/winemag-3201-4000.csv...
Reading file: ../raw_data/dataset_2/winemag-4001-4800.csv...
Reading file: ../raw_data/dataset_2/winemag-4801-5600.csv...
Reading file: ../raw_data/dataset_2/winemag-5601-6400.csv...
Reading file: ../raw_data/dataset_2/winemag-6401-7200.csv...
Reading file: ../raw_data/dataset_2/winemag-7201-8000.csv...
Reading file: ../raw_data/dataset_2/winemag-8001-8800.csv...
Reading file: ../raw_data/dataset_2/winemag-801-1600.csv...
Reading file: ../raw_data/dataset_2/winemag-8801-9600.csv...
Reading file: ../raw_data/dataset_2/winemag-9601-10400.csv...
Join datasets by title, description, price, country, points = rating. We choose to join by these characteristics, because it would allow us to safely assume that the wine reviews being merged are the same. However, we intentionally left choose not to join based on certain chacteristics like province = region, because there were some input errors in the data. For example, in dataset_1 the province was “Sicily & Sardinia” where as in dataset_2 the region was “Sicily & Sardinia”.
join_wines <- inner_join(dataset_1, dataset_2, by = c("title", "description", "price", "country", "points" = "rating"))
Column `title` joining factors with different levels, coercing to character vectorColumn `description` joining factors with different levels, coercing to character vectorColumn `country` joining factors with different levels, coercing to character vector
Visualize joined data
head(join_wines)
Clean joined data to remove redundancies
join_wines_cleaned <- join_wines %>%
select(country, description, designation=designation.x, points, price, province, region=region_1, subregion=region_2, taster_name, taster_twitter_handle, title, variety, winery=winery.x, alcohol, category, url, vintage)
head(join_wines_cleaned)
Extract relevant information for our use
wines <- join_wines_cleaned %>%
select(title, alcohol, category, vintage, designation, country, province, region, subregion, variety, winery, price, points, taster_name, taster_twitter_handle)
head(wines)
Output uncleaned joined data for others to use
write_csv(join_wines, "joined_datasets.csv")
Outout cleaned joined data for others to use
write_csv(join_wines_cleaned, "joined_datasets_cleaned.csv")
Output for easy import later
write_csv(wines, "wines.csv")